Source codes: https://github.com/Anne-LiseGerard/dftd_rnaseq Data: https://doi.org/10.1007/s00018-019-03259-2
suppressPackageStartupMessages({
library("reshape2")
library("gplots")
library("DESeq2")
library("mitch")
library("limma")
library("kableExtra")
library("dplyr")
library("tidyr")
library("ComplexHeatmap")
library("RColorBrewer")
library("circlize")
library("pathview")
library("stringr")
library("readxl")
library("grid")
library("ggplot2")
library("viridis")
library("enrichplot")
})
knitr::opts_chunk$set(dev = 'svg')
Goal: re-analyse previously published DFTD tumour biopsy transcriptomes and compare to DFTD cell line transcriptomes, focusing on genes involved in metabolism. Below is a list of the samples for this project.
ss <- read.table("../ss_patchett.txt",sep="\t",fill=TRUE,header=TRUE)
ss$DFT <- as.factor(ss$DFT)
ss %>%
kbl(caption="Sample sheet for all samples") %>%
kable_paper("hover", full_width = F)
| run_accession | sample_id | DFT | sample | sample_type |
|---|---|---|---|---|
| ERR2804403 | sha_DFT1_1-1 | DFT1 | DFT1_1 | biopsy |
| ERR2804404 | sha_DFT1_1-2 | DFT1 | DFT1_1 | biopsy |
| ERR2804405 | sha_DFT1_2-1 | DFT1 | DFT1_2 | biopsy |
| ERR2804406 | sha_DFT1_2-2 | DFT1 | DFT1_2 | biopsy |
| ERR2804407 | sha_DFT2_1-1 | DFT2 | DFT2_1 | biopsy |
| ERR2804408 | sha_DFT2_1-2 | DFT2 | DFT2_1 | biopsy |
| ERR2804409 | sha_DFT2_2-1 | DFT2 | DFT2_2 | biopsy |
| ERR2804410 | sha_DFT2_2-2 | DFT2 | DFT2_2 | biopsy |
| ERR2852729 | sha_DFT2_RV_2-2 | DFT2 | RV_2 | cell_line |
| ERR2852728 | sha_DFT2_RV_2-1 | DFT2 | RV_2 | cell_line |
| ERR2852727 | sha_DFT2_RV_1-2 | DFT2 | RV_1 | cell_line |
| ERR2852726 | sha_DFT2_RV_1-1 | DFT2 | RV_1 | cell_line |
| SRR6266557 | C5065_UT_01 | DFT1 | C5065 | cell_line |
| SRR6266556 | C5065_UT_02 | DFT1 | C5065 | cell_line |
# volcanoplots
make_volcano <- function(de,name) {
sig <- subset(de,padj<0.05)
N_SIG=nrow(sig)
N_UP=nrow(subset(sig,log2FoldChange>0))
N_DN=nrow(subset(sig,log2FoldChange<0))
DET=nrow(de)
HEADER=paste(N_SIG,"@5%FDR,", N_UP, "up", N_DN, "dn", DET, "detected")
plot(de$log2FoldChange,-log10(de$padj),cex=1,pch=19,col="#D5D7E2",
main=name, xlab="log2 FC", ylab="-log10 pval")
mtext(HEADER)
grid()
points(sig$log2FoldChange,-log10(sig$padj),cex=1,pch=19,col="#5297A7")
}
#heatmaps
custom_heatmap <- function(zscore, fc, aveexpr, title) {
h1 <- Heatmap(zscore, cluster_rows = F,
column_labels = colnames(zscore), name="Z-score",
cluster_columns = T,
column_names_gp = gpar(fontsize = 7, fontface="bold"),
col=colorRamp2(c(-1.5,0,1.5), hcl_palette = "Blue-Red 2"),
heatmap_legend_param = list(title = "Z-score", at = seq(-1.5, 1.5, 0.5)))
h2 <- Heatmap(fc, row_labels = rownames(fc), row_names_gp = gpar(fontsize = 6),
cluster_rows = F, name="logFC", col = col_logFC,column_names_gp = gpar(fontsize =7,fontface="bold"),
cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
grid.text(round(as.numeric(fc[i, j],2),2), x, y,gp=gpar(fontsize=6))
}, column_title = title,
heatmap_legend_param = list(title = "logFC", at = seq(-20, 20, 10)))
h <- h1 + h2
h
}
Import data from the aligner.
tmp <- read.table("../fastq/3col_patchett.tsv.gz")
tmp$V1 <- gsub('.fastq-trimmed.fastq','',tmp$V1)
x <- as.data.frame(acast(tmp, V2~V1, value.var="V3",fun.aggregate=sum))
tmp_old <- read.table("../fastq/3col_old.tsv.gz")
tmp_old$V1 <- gsub('.fastq-trimmed.fastq','',tmp_old$V1)
x_old <- as.data.frame(acast(tmp_old, V2~V1, value.var="V3",fun.aggregate=sum))
Load gene names.
gn <- read.table("../ref/Sarcophilus_harrisii.mSarHar1.11.cdna+ncrna.genenames.tsv",fill=TRUE)
gn <- gn[order(gn$V1),]
gn_old <- read.table("../ref/Sarcophilus_harrisii.DEVIL7.0.cdna+ncrna.genenames.tsv",fill=TRUE)
gn_old <- gn_old[order(gn_old$V1),]
Load homology table.
hm <- read.table("../ref/mart_export_ensembl109_2023-07-14.txt",sep="\t",header=TRUE)
hm_old <- read.table("../ref/mart_export_ensembl101_2023-09-06.txt",sep="\t",header=TRUE)
Now need to collapse transcript data to genes.
x$gene <- paste(gn$V2,gn$V3)
y <- aggregate(. ~ gene,x,sum)
rownames(y) <- y$gene
y$gene = NULL
x_old$gene <- paste(gn_old$V2,gn_old$V3)
y_old <- aggregate(. ~ gene,x_old,sum)
rownames(y_old) <- y_old$gene
y_old$gene = NULL
Samples with <1M reads should be omitted. Will also round values to integers. Using the new genome yields a higher number of mapped reads (this data was originally mapped on the DEVIL 7.0 assembly in Patchett 2019).
cs <- colSums(y)
cs <- cs[order(cs)]
par(mar=c(5,10,5,2))
barplot(cs,main="All samples",horiz=TRUE,las=1)
abline(v=1e7,col="red",lty=2)
y <- round(y)
cs_old <- colSums(y_old)
cs_old <- cs_old[order(cs_old)]
par(mar=c(5,10,5,2))
barplot(cs_old,main="All samples - old genome ",horiz=TRUE,las=1, xli=c(0,4e+07))
abline(v=1e7,col="red",lty=2)
y_old <- round(y_old)
This will help us to visualise the sources of variability in the overall dataset. Plot MDS. Also fix the sample names. Plot MDS by DFT. Plot MDS by cell lines and biiopsies. Decided to only include biopsies for rest of analysis.
par(mar = c(5.1, 4.1, 4.1, 2.1))
colnames(y) <- ss$sample_id
saveRDS(y, file = "y_biopsies.rds")
cs <- colSums(y)
cs <- cs[order(cs)]
par(mar=c(5,10,5,2))
barplot(cs,main="All samples",horiz=TRUE,las=1)
par(mar = c(5.1, 4.1, 4.1, 2.1))
colnames(y_old) <- ss$sample_id[1:8]
cs_old <- colSums(y_old)
cs_old <- cs_old[order(cs_old)]
par(mar=c(5,10,5,2))
barplot(cs_old,main="All samples - old genome",horiz=TRUE,las=1, xlim=c(0,4e+07))
par(mar = c(5.1, 4.1, 4.1, 2.1) )
cols <- ss$DFT
cols <- gsub("DFT1","#B3B9DF",cols)
cols <- gsub("DFT2","#DDAFB7",cols)
mymds <- plotMDS(y,plot=FALSE)
mymds_old <- plotMDS(y_old,plot=FALSE)
# fix the xlims
XMIN=min(mymds$x)
XMAX=max(mymds$x)
XMID=(XMAX+XMIN)/2
XMIN <- XMID + (XMIN-XMID)*1.1
XMAX <- XMID+(XMAX-XMID)*1.1
plotMDS(mymds,pch=17,cex=3,col=cols,main="MDS plot",xlim=c(XMIN,XMAX))
text(mymds,labels=colnames(y))
mtext("blue=DFT1, pink=DFT2")
plotMDS(mymds_old,pch=17,cex=3,col=cols,main="MDS plot - old genome",xlim=c(XMIN,XMAX))
text(mymds_old,labels=colnames(y_old))
mtext("blue=DFT1, pink=DFT2")
y_DFT1 <- y[,colnames(y) %in% c("sha_DFT1_1-1","sha_DFT1_1-2","sha_DFT1_2-1","sha_DFT1_2-2",
"C5065_UT_01", "C5065_UT_02")]
y_DFT2 <- y[,colnames(y) %in% c("sha_DFT2_1-1","sha_DFT2_1-2","sha_DFT2_2-1","sha_DFT2_2-2",
"sha_DFT2_RV_1-1","sha_DFT2_RV_1-2","sha_DFT2_RV_2-1","sha_DFT2_RV_2-2")]
y_biopsy <- y[,colnames(y) %in% c("sha_DFT1_1-1","sha_DFT1_1-2","sha_DFT1_2-1","sha_DFT1_2-2",
"sha_DFT2_1-1","sha_DFT2_1-2","sha_DFT2_2-1","sha_DFT2_2-2")]
y_cline <- y[,colnames(y) %in% c( "C5065_UT_01", "C5065_UT_02",
"sha_DFT2_RV_1-1","sha_DFT2_RV_1-2","sha_DFT2_RV_2-1","sha_DFT2_RV_2-2")]
mdsDFT1 <- plotMDS(y_DFT1,plot=FALSE)
plotMDS(mdsDFT1,pch=17,cex=3,col="#B3B9DF",main="MDS plot",xlim=c(XMIN,XMAX))
text(mdsDFT1,labels=colnames(y_DFT1))
mtext("DFT1")
mdsDFT2 <- plotMDS(y_DFT2,plot=FALSE)
plotMDS(mdsDFT2,pch=17,cex=3,col="#DDAFB7",main="MDS plot",xlim=c(XMIN,XMAX))
text(mdsDFT2, labels=colnames(y_DFT2))
mtext("DFT2")
mds_biopsy <- plotMDS(y_biopsy,plot=FALSE)
plotMDS(mds_biopsy,pch=17,cex=3,col=cols,main="Tumour biopsies",xlim=c(XMIN,XMAX))
text(mds_biopsy,labels=colnames(y_biopsy))
mtext("Tumour biopsies: blue=DFT1, pink=DFT2")
mds_cline <- plotMDS(y_cline,plot=FALSE)
plotMDS(mds_cline,pch=17,cex=3,col=cols,main="MDS plot",xlim=c(XMIN,XMAX))
text(mds_cline,labels=colnames(y_cline))
mtext("Cell lines: blue=DFT1, pink=DFT2")
Sum replicates then run differential expression analysis. For now, ignore alignment on old genome and cell line data.
y_cell <- y[,9:14]
y <- y[,1:8]
# combine replicates for each sample
DFT1_1 <- rowSums(y[,ss$sample=="DFT1_1"])
DFT1_2 <- rowSums(y[,ss$sample=="DFT1_2"])
DFT2_1 <- rowSums(y[,ss$sample=="DFT2_1"])
DFT2_2 <- rowSums(y[,ss$sample=="DFT2_2"])
y2 <- data.frame(DFT1_1,DFT1_2,DFT2_1,DFT2_2)
ss2 <- as.data.frame(colnames(y2))
colnames(ss2) <- "sample"
ss2$DFT <- factor(c("DFT1","DFT1","DFT2","DFT2"))
y2 <- y2[which(rowMeans(y2)>10),] # remove genes with average count < 10
dim(y2)
## [1] 15931 4
dds <- DESeqDataSetFromMatrix(countData = y2 , colData = ss2, design = ~ DFT )
## converting counts to integer mode
dds2 <- dds
res <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
z <- results(res)
vsd <- vst(dds)
zz<-cbind(as.data.frame(z),assay(vsd))
dge<-as.data.frame(zz[order(zz$pvalue),])
dge2 <- dge
sig <- subset(dge,padj<0.05)
sig2 <- sig
sig2_up <- rownames(subset(sig,log2FoldChange>0))
sig2_dn <- rownames(subset(sig,log2FoldChange<0))
length(sig2_up)
## [1] 2536
length(sig2_dn)
## [1] 2033
make_volcano(dge2,"Tumour biopsies")